library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggrepel)
library(ggcorrplot)
library(gmp)
## 
## Attaching package: 'gmp'
## The following objects are masked from 'package:base':
## 
##     %*%, apply, crossprod, matrix, tcrossprod
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
bad_strains <- c('YBR084C-A', 'YBR098W', 'YCR036W',  'YCR044C',  'YCR077C',  'YHR090C', 
                 'YJL117W', 'YJL190C', 'YKR024C', 'YKR062W', 'YML008C', 'YML051W', 'YMR231W', 
                 'YNL330C', 'YPR072W', 'YDL192W', 'YDL082W', 'YDR443C', 'YDL135C', 'YDL135C')
##### bad strains are library strains to be removed (according to Amelie's file, and Hannes's instruction, those should be removed)


### hannes_averaged4 E-MAP score data for Gsp1 mutants
#### read it in and make it tidy
e.map <- read_tsv("basic_E-MAP_data/avg_merged_June2016_screen_for_Gia.txt", col_names = T)   ## use export for Gia because it has ORF names for library
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene = col_character()
## )
## See spec(...) for full column specifications.
e.map <- e.map %>% gather(library, score, -Gene) %>%
  filter(! library %in% bad_strains) %>%
  arrange(desc(Gene), desc(library)) %>%
  mutate("Gene" = gsub(Gene, pattern = "GSP1:", replacement = "", perl = T))

#### these are all the gsp1 mutants screened (including WT-NAT and 3xFLAG constructs)
### 59 query constructs all together
(Gsp1_queries <- unique(e.map[["Gene"]]))  
##  [1] "Y157A"         "Y148I"         "T34Y"          "T34S"         
##  [5] "T34Q"          "T34N"          "T34L"          "T34G"         
##  [9] "T34E"          "T34D"          "T34A"          "T139R"        
## [13] "T139A"         "T137G"         "R78K"          "R112S"        
## [17] "R112A"         "R108Y"         "R108S"         "R108Q"        
## [21] "R108L"         "R108I"         "R108G"         "R108D"        
## [25] "R108A"         "Q147L"         "Q147E"         "NTER3XFLAG_WT"
## [29] "N84Y"          "N105V"         "N105L"         "N102M"        
## [33] "N102K"         "N102I"         "K169I"         "K154M"        
## [37] "K143Y"         "K143W"         "K143H"         "K132H"        
## [41] "K129T"         "K129I"         "K129F"         "K129E"        
## [45] "K101R"         "H141V"         "H141R"         "H141I"        
## [49] "H141E"         "GSP1-NAT"      "G80A"          "F58L"         
## [53] "F58A"          "E115I"         "E115A"         "D79S"         
## [57] "D79A"          "CTER3XFLAG_WT" "A180T"
### ORF to gene name annotation from SGD
orf_gene_name_index <- read_delim("orf_gene_GO_sgd_annotation.txt", col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character()
## )
orf_index <- unique(tibble("orf" = orf_gene_name_index$X1, "gene_name" = orf_gene_name_index$X2))
rm(orf_gene_name_index)
#############################

### Gsp1 mutants merged with ubermap (merged with ubermap if there is an at least 100 library genes overlap between 
# chromatin library and the library against which that query was screened)
# (gene names) - this is only necessary to distinguish point mutants (they are all the same in the ORF file)
gene_names.ubermap <- read_delim("basic_E-MAP_data/20180108_gene_names_merge_w_Ubermap_100.txt", 
                           delim = "\t", col_names = T, n_max = length(Gsp1_queries))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene = col_character()
## )
## See spec(...) for full column specifications.
gene_names.ubermap <- gene_names.ubermap %>% gather(library.ORF, score, -Gene) %>%
  mutate("Gene" = gsub(Gene, pattern = "GSP1 - ", replacement = "", perl = T))
mutants <- unique(gene_names.ubermap$Gene)
rm(gene_names.ubermap)
### Gsp1 mutants merged with ubermap (orf names)
# replace Gsp1 ORFs with mutation names (because they are all the same orf!)
ubermap <- read_delim("basic_E-MAP_data/20180108_orf_names_merge_w_Ubermap_100.txt", col_names = T, delim = "\t")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene = col_character()
## )
## See spec(...) for full column specifications.
ubermap[ 1:length(mutants), 1 ] <- mutants

#### can't gather before sorting out those few Genes that occur multiple times (add a unique identifier to them)
### first find those Genes that are not unique (probably data from separate experiments/labs for the same thing? 
## Stll better not to mix them up)
(not_uniq <- ubermap %>% 
    group_by(Gene) %>% 
    summarize("count" = n()) %>% 
    filter(count > 1) %>% 
    ungroup()
)
## # A tibble: 10 x 2
##    Gene              count
##    <chr>             <int>
##  1 YDL155W - YDL155W     2
##  2 YDR113C - YDR113C     4
##  3 YER008C - YER008C     2
##  4 YGR038W - YGR038W     3
##  5 YGR108W - YGR108W     2
##  6 YGR109C - YGR109C     2
##  7 YJL085W - YJL085W     2
##  8 YLR350W - YLR350W     3
##  9 YPR119W - YPR119W     2
## 10 YPR120C - YPR120C     2
#### check correlations between non-unique samples
not_uniq_uber <- ubermap %>% 
  filter(Gene %in% not_uniq$Gene) %>% 
  arrange(Gene) %>% 
  mutate("Gene" = str_c(Gene, 1:24, sep = "_"))
not_uniq_matrix <- as.matrix(not_uniq_uber[, -1])
rownames(not_uniq_matrix) <- not_uniq_uber$Gene
cormat <- round(cor(t(not_uniq_matrix[, -1]), use = "pairwise.complete.obs"), 2)
####### correlations between duplicate data doesn't look that good
cormat %>% ggcorrplot(hc.order = TRUE, outline.col = "white",insig = "blank", tl.cex = 7) + 
  ggtitle("Pearson correlations between repeated screens") 

### these correlations look really bad
## check if this is due to the small overlap in libraries

not_uniq_uber_gather <- not_uniq_uber %>% 
  gather(library.ORF, score, -Gene) %>% 
  mutate("Gene.ORF" = gsub(Gene, pattern = " - .*", replacement = "", perl = T))
non_uniq_gene_ORFS <- not_uniq_uber_gather %>% pull(Gene.ORF) %>% unique()
plots <- list()
for (i in seq_along(non_uniq_gene_ORFS)) {
  plots[[i]] <- not_uniq_uber_gather %>% 
    filter(Gene.ORF == non_uniq_gene_ORFS[i]) %>%
    select(-Gene.ORF) %>% 
    spread(Gene, score) %>%
    select(-library.ORF) %>% 
    ggpairs()
}
print(plots) 
## [[1]]
## Warning: Removed 376 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 750 rows containing missing values
## Warning: Removed 750 rows containing missing values (geom_point).
## Warning: Removed 750 rows containing non-finite values (stat_density).

## 
## [[2]]
## Warning: Removed 767 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 784 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 767 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 782 rows containing missing values
## Warning: Removed 784 rows containing missing values (geom_point).
## Warning: Removed 766 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 766 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 779 rows containing missing values
## Warning: Removed 767 rows containing missing values (geom_point).
## Warning: Removed 766 rows containing missing values (geom_point).
## Warning: Removed 749 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 765 rows containing missing values
## Warning: Removed 782 rows containing missing values (geom_point).
## Warning: Removed 779 rows containing missing values (geom_point).
## Warning: Removed 765 rows containing missing values (geom_point).
## Warning: Removed 765 rows containing non-finite values (stat_density).

## 
## [[3]]
## Warning: Removed 900 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 972 rows containing missing values
## Warning: Removed 972 rows containing missing values (geom_point).
## Warning: Removed 260 rows containing non-finite values (stat_density).

## 
## [[4]]
## Warning: Removed 901 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 901 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 901 rows containing missing values
## Warning: Removed 901 rows containing missing values (geom_point).
## Warning: Removed 454 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 899 rows containing missing values
## Warning: Removed 901 rows containing missing values (geom_point).
## Warning: Removed 899 rows containing missing values (geom_point).
## Warning: Removed 899 rows containing non-finite values (stat_density).

## 
## [[5]]
## Warning: Removed 137 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 789 rows containing missing values
## Warning: Removed 789 rows containing missing values (geom_point).
## Warning: Removed 750 rows containing non-finite values (stat_density).

## 
## [[6]]
## Warning: Removed 398 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 775 rows containing missing values
## Warning: Removed 775 rows containing missing values (geom_point).
## Warning: Removed 761 rows containing non-finite values (stat_density).

## 
## [[7]]
## Warning: Removed 910 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 1076 rows containing missing values
## Warning: Removed 1076 rows containing missing values (geom_point).
## Warning: Removed 507 rows containing non-finite values (stat_density).

## 
## [[8]]
## Warning: Removed 891 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 892 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 891 rows containing missing values
## Warning: Removed 892 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 892 rows containing missing values
## Warning: Removed 891 rows containing missing values (geom_point).
## Warning: Removed 892 rows containing missing values (geom_point).
## Warning: Removed 891 rows containing non-finite values (stat_density).

## 
## [[9]]
## Warning: Removed 749 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 749 rows containing missing values
## Warning: Removed 749 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).

## 
## [[10]]
## Warning: Removed 547 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 904 rows containing missing values
## Warning: Removed 904 rows containing missing values (geom_point).
## Warning: Removed 746 rows containing non-finite values (stat_density).

##### add a number in front of all the non-unique Gene
for (temp.gene in not_uniq$Gene) {
  row.names <- which(ubermap$Gene == temp.gene)
  for (i in seq_along(row.names)) {
    ubermap$Gene[row.names[i]] <- str_c(i, ubermap$Gene[row.names[i]], sep = "_")
  }
}
### now those few non-unique Genes look like this: 1_YDL155W - YDL155W
#### check
filter(ubermap, Gene == "1_YDL155W - YDL155W" | Gene == "2_YDL155W - YDL155W")
## # A tibble: 2 x 1,357
##   Gene     YAL002W YAL007C YAL010C YAL011W YAL013W YAL019W YAL020C YAL021C
##   <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1_YDL15…   -1.03      NA   0.653   0.349  -0.633   0.514   0.133   0.592
## 2 2_YDL15…    1.30      NA  NA      NA      NA      -1.39   NA      NA    
## # ... with 1,348 more variables: YAL023C <dbl>, YAL024C <dbl>,
## #   YAL026C <dbl>, YAL027W <dbl>, YAL040C <dbl>, YAL042W <dbl>,
## #   YAL048C <dbl>, YAL051W <dbl>, YAL053W <dbl>, YAL055W <dbl>,
## #   `YAR002C-A` <dbl>, YAR002W <dbl>, YAR003W <dbl>, YAR014C <dbl>,
## #   YAR018C <dbl>, YBL007C <dbl>, YBL008W <dbl>, YBL011W <dbl>,
## #   YBL017C <dbl>, YBL021C <dbl>, YBL024W <dbl>, YBL027W <dbl>,
## #   YBL032W <dbl>, YBL036C <dbl>, YBL046W <dbl>, YBL047C <dbl>,
## #   YBL051C <dbl>, YBL052C <dbl>, YBL058W <dbl>, YBL061C <dbl>,
## #   YBL066C <dbl>, YBL067C <dbl>, YBL072C <dbl>, YBL075C <dbl>,
## #   YBL078C <dbl>, YBL079W <dbl>, YBL081W <dbl>, YBL082C <dbl>,
## #   YBL085W <dbl>, YBL086C <dbl>, YBL087C <dbl>, YBL088C <dbl>,
## #   YBL101C <dbl>, YBL103C <dbl>, YBR009C <dbl>, YBR010W <dbl>,
## #   YBR015C <dbl>, YBR022W <dbl>, YBR023C <dbl>, YBR025C <dbl>,
## #   YBR031W <dbl>, YBR034C <dbl>, YBR036C <dbl>, YBR041W <dbl>,
## #   YBR057C <dbl>, YBR058C <dbl>, YBR061C <dbl>, YBR065C <dbl>,
## #   YBR069C <dbl>, YBR073W <dbl>, YBR076W <dbl>, YBR078W <dbl>,
## #   YBR082C <dbl>, YBR083W <dbl>, YBR088C_DAmP <dbl>, YBR093C <dbl>,
## #   YBR094W <dbl>, YBR095C <dbl>, YBR098W <dbl>, YBR103W <dbl>,
## #   YBR106W <dbl>, YBR107C <dbl>, YBR108W <dbl>, YBR114W <dbl>,
## #   YBR119W <dbl>, YBR125C <dbl>, YBR126C <dbl>, YBR131W <dbl>,
## #   YBR132C <dbl>, YBR135W_DAmP <dbl>, YBR141C <dbl>, YBR150C <dbl>,
## #   YBR156C <dbl>, YBR159W <dbl>, YBR160W_DAmP <dbl>, YBR162C <dbl>,
## #   `YBR162W-A` <dbl>, YBR164C <dbl>, YBR168W <dbl>, YBR171W <dbl>,
## #   YBR175W <dbl>, YBR181C <dbl>, YBR186W <dbl>, YBR193C_DAmP <dbl>,
## #   YBR194W <dbl>, YBR195C <dbl>, YBR200W <dbl>, YBR201W <dbl>,
## #   YBR212W <dbl>, YBR215W <dbl>, …
(not_uniq <- ubermap %>% group_by(Gene) %>% 
    summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 0 x 2
## # ... with 2 variables: Gene <chr>, count <int>
# now I can gather the data
ubermap <- ubermap %>% gather(library.ORF, score, -Gene)


## mutants only
ubermap.mutants.only <- ubermap %>% filter( Gene %in% mutants) %>%
  #mutate("library.ORF" = gsub(library.ORF, pattern = "\\.", replacement = "-", perl = T)) %>% 
  ### this is to replace the YOR298C.A with YOR298C-A (R puts . instead of - when loading library genes as column names)
  ### deprecated step now that we have tidyverse!
  add_column("ORF" = "YLR293C") %>% 
  rename("Gene_uniq" = Gene) %>% 
  mutate("Gene.gene_name" = "GSP1") %>% 
  inner_join(., orf_index, by = c("library.ORF" = "orf")) %>%
  rename("library.gene_name" = gene_name)

# save the mutant only data
write_tsv(ubermap.mutants.only, path = "basic_E-MAP_data/preprocessed_ubermap_mut_only.txt")

ubermap.mutants.only %>% 
  ggplot(., mapping = aes(x = score)) + 
  geom_density() + ggtitle("Distribution of E-MAP scores in Gsp1 mutants")
## Warning: Removed 865 rows containing non-finite values (stat_density).

#### look at the distribution of scores for Gsp1 mutants
ubermap.mutants.only %>% 
  ggplot(ubermap.mutants.only, 
         mapping = aes(x = score, 
             y = reorder(Gene_uniq, score, FUN = function(x) quantile(x, prob = 0.05, na.rm = T)))) +
         geom_boxplot() +
        ylab("Mutants ordered by the 5th E-MAP score percentile") +
        xlab("E-MAP score")
## Warning: Removed 865 rows containing non-finite values (stat_boxplot).
## Warning: position_dodge requires non-overlapping x intervals

ubermap.per.mutant.95conf <- ubermap.mutants.only %>%
  group_by(Gene_uniq) %>%
  summarize(lower_quant = quantile(score, probs = .025, na.rm = T), 
            upper_quant = quantile(score, probs = .975, na.rm = T),
            upper_whisker = boxplot(score, plot = F)$stats[5, ],
            lower_whisker = boxplot(score, plot = F)$stats[1, ]) %>%
  mutate(category = factor(Gene_uniq, levels = Gene_uniq)) %>%
  arrange(lower_quant)
ubermap.per.mutant.95conf_strong <- ubermap.per.mutant.95conf %>% 
  filter(lower_quant < -3)
ubermap.per.mutant.95conf %>% 
  ggplot(., mapping = aes(x = lower_quant, y = upper_quant)) + geom_point() +
  geom_label_repel(data = ubermap.per.mutant.95conf_strong, aes(label = Gene_uniq)) +
  xlab("E-MAP score - lower 2.5 %") + ylab("E-MAP score - higher 97.5 %")

ubermap.per.mutant.95conf %>% 
  ggplot(., mapping = aes(x = lower_whisker, y = upper_whisker)) + geom_point() +
  geom_label_repel(data = ubermap.per.mutant.95conf_strong, aes(label = Gene_uniq)) +
  xlab("E-MAP score - lower whisker") + ylab("E-MAP score - upper whisker")

#ubermap.mutants.only <- ubermap.mutants.only %>% 
 # mutate(category = factor(Gene, levels = ubermap.per.mutant.95conf$Gene)) %>%  #### turn into factors so I can order by the lower quartile
#  arrange(category)

## tidy up all but mutants
ubermap.ubergenes.only <- ubermap %>% filter(! (Gene %in% mutants) ) %>%
  #### this is to make sure things like YLR337C - DELTA YLR338W - YLR337C - DELTA YLR338W are not collapse to just YLR337C in the next step
  mutate("Gene" = gsub(Gene, pattern = " - DELTA", replacement = " -- DELTA", perl = T)) %>%
  # this replacement is to remove the redundant ORF before the unique ORF (e.g. YFL008W_TSQ321 - YFL008W_TSQ321 -> keep only the first one) - > first one has the unique i_ for the few redundant ones!
  mutate("Gene" = gsub(Gene, pattern = " - .+$", replacement = "", perl = T))
  ### this is to replace the YOR298C.A with YOR298C-A (R puts . instead of - when loading library genes as column names) - deprecated with tidyverse
  #mutate("Gene" = gsub(Gene, pattern = "\\.", replacement = "-", perl = T))  



(deltaORFS <- ubermap.ubergenes.only %>% 
  filter(., grepl("DELTA", Gene)) %>% 
  pull(Gene) %>% unique())
##  [1] "YDR134C -- DELTA YDR133C" "YKL119C -- DELTA YKL118W"
##  [3] "YLR337C -- DELTA YLR338W" "YLR262C -- DELTA YLR261C"
##  [5] "YPR075C -- DELTA YPR075C" "YOL051W -- DELTA YOL050C"
##  [7] "YJL176C -- DELTA YJL175W" "YNL227C -- DELTA YNL228W"
##  [9] "YJL168C -- DELTA YJL169W" "YPL113C -- DELTA YPL114W"
## [11] "YNL236W -- DELTA YNL235C" "YNL297C -- DELTA YNL296W"
### most of the DELTA ORFs are dubious reading frames
### I will just use the first ORF for all of them

ubermap.ubergenes.only %>% 
  filter(Gene == "YDR134C -- DELTA YDR133C") %>% 
  mutate("ORF" = gsub(Gene, pattern = "^[0-9]+_", replacement = "", perl = T))
## # A tibble: 1,356 x 4
##    Gene                     library.ORF  score ORF                     
##    <chr>                    <chr>        <dbl> <chr>                   
##  1 YDR134C -- DELTA YDR133C YAL002W      1.13  YDR134C -- DELTA YDR133C
##  2 YDR134C -- DELTA YDR133C YAL007C     NA     YDR134C -- DELTA YDR133C
##  3 YDR134C -- DELTA YDR133C YAL010C     NA     YDR134C -- DELTA YDR133C
##  4 YDR134C -- DELTA YDR133C YAL011W      3.33  YDR134C -- DELTA YDR133C
##  5 YDR134C -- DELTA YDR133C YAL013W      1.48  YDR134C -- DELTA YDR133C
##  6 YDR134C -- DELTA YDR133C YAL019W      0.842 YDR134C -- DELTA YDR133C
##  7 YDR134C -- DELTA YDR133C YAL020C     NA     YDR134C -- DELTA YDR133C
##  8 YDR134C -- DELTA YDR133C YAL021C     NA     YDR134C -- DELTA YDR133C
##  9 YDR134C -- DELTA YDR133C YAL023C     NA     YDR134C -- DELTA YDR133C
## 10 YDR134C -- DELTA YDR133C YAL024C     NA     YDR134C -- DELTA YDR133C
## # ... with 1,346 more rows
#### in this step I make a column that has only the basic ORF - this is necessary so I can match to SGD gene name
ubermap.ubergenes.only <- ubermap.ubergenes.only %>% 
  rename("Gene_uniq" = "Gene") %>%
  mutate("ORF" = gsub(Gene_uniq, pattern = "^[0-9]+_", replacement = "", perl = T)) %>%
  mutate("ORF" = gsub(ORF, pattern = "_.+$", replacement = "", perl = T)) %>% 
  mutate("ORF" = gsub(ORF, pattern = " -- DELTA .+", replacement = "", perl = T))

ubermap.ubergenes.only %>% 
  filter(ORF == "YKL119C")
## # A tibble: 2,712 x 4
##    Gene_uniq                library.ORF   score ORF    
##    <chr>                    <chr>         <dbl> <chr>  
##  1 YKL119C -- DELTA YKL118W YAL002W      -1.54  YKL119C
##  2 YKL119C                  YAL002W      -6.82  YKL119C
##  3 YKL119C -- DELTA YKL118W YAL007C      NA     YKL119C
##  4 YKL119C                  YAL007C       0.428 YKL119C
##  5 YKL119C -- DELTA YKL118W YAL010C      NA     YKL119C
##  6 YKL119C                  YAL010C      NA     YKL119C
##  7 YKL119C -- DELTA YKL118W YAL011W       1.96  YKL119C
##  8 YKL119C                  YAL011W      -4.44  YKL119C
##  9 YKL119C -- DELTA YKL118W YAL013W       1.02  YKL119C
## 10 YKL119C                  YAL013W      NA     YKL119C
## # ... with 2,702 more rows
#### matching to SGD gene names
ubergenes.ubermap.gene_names <- inner_join(ubermap.ubergenes.only, orf_index, by = c("ORF" = "orf")) %>% 
  rename("Gene.gene_name" = "gene_name")
ubergenes.ubermap.gene_names_both <- inner_join(ubergenes.ubermap.gene_names, orf_index, by = c("library.ORF" = "orf")) %>%
  rename("library.gene_name" = "gene_name")
ubergenes.ubermap.gene_names_both %>% 
  filter(ORF == "YKL119C")
## # A tibble: 2,650 x 6
##    Gene_uniq     library.ORF   score ORF   Gene.gene_name library.gene_na…
##    <chr>         <chr>         <dbl> <chr> <chr>          <chr>           
##  1 YKL119C -- D… YAL002W      -1.54  YKL1… VPH2           VPS8            
##  2 YKL119C       YAL002W      -6.82  YKL1… VPH2           VPS8            
##  3 YKL119C -- D… YAL007C      NA     YKL1… VPH2           ERP2            
##  4 YKL119C       YAL007C       0.428 YKL1… VPH2           ERP2            
##  5 YKL119C -- D… YAL010C      NA     YKL1… VPH2           MDM10           
##  6 YKL119C       YAL010C      NA     YKL1… VPH2           MDM10           
##  7 YKL119C -- D… YAL011W       1.96  YKL1… VPH2           SWC3            
##  8 YKL119C       YAL011W      -4.44  YKL1… VPH2           SWC3            
##  9 YKL119C -- D… YAL013W       1.02  YKL1… VPH2           DEP1            
## 10 YKL119C       YAL013W      NA     YKL1… VPH2           DEP1            
## # ... with 2,640 more rows
write_tsv(ubergenes.ubermap.gene_names_both, path = "basic_E-MAP_data/preprocessed_ubermap_ubergenes_only.txt")

########### 95% confidence for WT (GSP1-NAT)
(wt <- ubermap.per.mutant.95conf %>% 
    filter(Gene_uniq == "GSP1-NAT") %>% 
    select(-Gene_uniq, -category))
## # A tibble: 1 x 4
##   lower_quant upper_quant upper_whisker lower_whisker
##         <dbl>       <dbl>         <dbl>         <dbl>
## 1       -1.20        1.05          1.09         -1.26
#### 95 % confidence for all the mutants data
(average.95conf <- ubermap.mutants.only %>% 
    summarize(lower_quant = quantile(score, probs = .025, na.rm = T), 
              upper_quant = quantile(score, probs = .975, na.rm = T),
              upper_whisker = boxplot(score, plot = F)$stats[5, ],
              lower_whisker = boxplot(score, plot = F)$stats[1, ]))
## # A tibble: 1 x 4
##   lower_quant upper_quant upper_whisker lower_whisker
##         <dbl>       <dbl>         <dbl>         <dbl>
## 1       -2.89        2.23          1.59         -1.65
(s.lim.point<-c(average.95conf$lower_quant, average.95conf$upper_quant))
##      2.5%     97.5% 
## -2.891910  2.225918
plot <- ggplot(ubermap.mutants.only, aes(x = score)) +
  facet_wrap( ~ category) +
  geom_density() +
  geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = lower_quant), size = 1) +
  geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = upper_quant), size = 1) +
  geom_vline(data = wt, aes(xintercept = lower_quant), color = "red", size = 1) + 
  geom_vline(data = wt, aes(xintercept = upper_quant), color = "red", size = 1) +
  geom_vline(data = average.95conf, aes(xintercept = lower_quant), color = "green", size = 1) +
  geom_vline(data = average.95conf, aes(xintercept = upper_quant), color = "green", size = 1) +
  geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = lower_whisker), color = "yellow", size = 1) +
  geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = upper_whisker), color = "yellow", size = 1) +
  geom_vline(data = wt, aes(xintercept = lower_whisker), color = "orange", size = 1) + 
  geom_vline(data = wt, aes(xintercept = upper_whisker), color = "orange", size = 1) +
  geom_vline(data = average.95conf, aes(xintercept = lower_whisker), color = "blue", size = 1) +
  geom_vline(data = average.95conf, aes(xintercept = upper_whisker), color = "blue", size = 1)
print(plot)
## Warning: Removed 51035 rows containing non-finite values (stat_density).

pdf(file = "Gsp1_mutants_score_distributions.pdf", width = 14)
print(plot)
## Warning: Removed 51035 rows containing non-finite values (stat_density).
dev.off()
## quartz_off_screen 
##                 2
####### What is the distribution of scores for the WT construct?
ubermap.mutants.only %>% filter(Gene_uniq == "GSP1-NAT") %>%
  ggplot(aes(x = score)) + geom_histogram() + ggtitle("Distribution of scores for GSP1-NAT")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).

all.library.genes <- ubermap.mutants.only %>%
  select(library.ORF) %>% unique() %>% pull()
(length(all.library.genes))
## [1] 1325
##### What is the distribution of scores for the library genes that have a score of lower than 10 in at least one mutant?
highest_mut_emap_score_lib_genes <- ubermap.mutants.only %>% 
  filter(score < -10) %>% 
  pull(library.gene_name) %>% unique()
length(highest_mut_emap_score_lib_genes)
## [1] 81
ubermap.mutants.only %>% 
  filter(library.gene_name %in% highest_mut_emap_score_lib_genes) %>%
  ggplot(aes(x = score)) +
  facet_wrap(~library.gene_name) +
  geom_histogram() + ggtitle("Distribution of scores for most negative library genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).

#### get library genes with the most broad score distributions (for Gsp1 mutants genetic interactions)
mut_lib_genes_score_range <- ubermap.mutants.only %>% 
  group_by(library.ORF) %>% 
  do(setNames(data.frame(t(range(.$score, na.rm = T))), c("score_min", "score_max"))) %>% 
  mutate("score_range" = score_max - score_min) %>% 
  arrange(desc(score_range))
mut_lib_genes_score_range %>% 
  ggplot(., aes(x = score_range)) + geom_density()

### there are 304 library gene with a score range larger of 7.5
(mut_lib_genes_score_range_7.5 <- mut_lib_genes_score_range %>% 
  filter(score_range > 7.5) %>% 
  inner_join(., orf_index, by = c("library.ORF" = "orf")) %>% 
  rename("library.gene_name" = "gene_name"))
## # A tibble: 304 x 5
## # Groups:   library.ORF [304]
##    library.ORF score_min score_max score_range library.gene_name
##    <chr>           <dbl>     <dbl>       <dbl> <chr>            
##  1 YLR018C         -19.2      5.03        24.3 POM34            
##  2 YDR395W         -17.4      5.84        23.3 SXM1             
##  3 YHR167W         -17.1      4.88        22.0 THP2             
##  4 YBL079W         -17.2      4.18        21.4 NUP170           
##  5 YDR363W-A       -17.4      3.87        21.3 SEM1             
##  6 YMR198W         -17.5      3.71        21.3 CIK1             
##  7 YMR153W         -18.1      2.33        20.4 NUP53            
##  8 YMR272C         -15.7      4.69        20.4 SCS7             
##  9 YML062C         -16.9      3.46        20.3 MFT1             
## 10 YPL018W         -17.1      2.99        20.1 CTF19            
## # ... with 294 more rows
write_tsv(mut_lib_genes_score_range_7.5, path = "library_genes_with_score_range_over_7.5_in_Gsp1_mut_screens.txt")
######### SIGNIFICANT LIBRARY GENES are defined as as genes that have an E-MAP score outside of the s.lim.point 
## with at least one of the Gsp1 point mutants:
# #### make a subset of ubergenes.ubermap.gene_names_both that only has 
## library genes that have scores outside the s.lim.point with at least one of the mutants
significant.library.genes <- ubermap.mutants.only %>% 
  filter(findInterval(score, s.lim.point) != 1) %>% 
  select(library.ORF) %>% unique() %>% pull()
(length(significant.library.genes))
## [1] 977
########### QUESTION: Should I remove from the list of significant.library.genes those that have significant scores with GSP1-NAT (wt control) 
### One could assume that those genetic interaction are an artifact of the construct
(sig.interactions.in.wt <- ubermap.mutants.only %>%
  filter(Gene_uniq == "GSP1-NAT" & findInterval(score, s.lim.point) != 1) %>% 
    inner_join(., orf_index, by = c("library.ORF" = "orf")) %>% 
    inner_join(., mut_lib_genes_score_range, by = "library.ORF"))
## # A tibble: 7 x 10
##   Gene_uniq library.ORF score ORF     Gene.gene_name library.gene_name
##   <chr>     <chr>       <dbl> <chr>   <chr>          <chr>            
## 1 GSP1-NAT  YBL079W      2.51 YLR293C GSP1           NUP170           
## 2 GSP1-NAT  YDR159W      3.78 YLR293C GSP1           SAC3             
## 3 GSP1-NAT  YDR363W-A    3.37 YLR293C GSP1           SEM1             
## 4 GSP1-NAT  YHR167W      4.81 YLR293C GSP1           THP2             
## 5 GSP1-NAT  YKR082W      4.43 YLR293C GSP1           NUP133           
## 6 GSP1-NAT  YML062C      2.66 YLR293C GSP1           MFT1             
## 7 GSP1-NAT  YML103C      3.12 YLR293C GSP1           NUP188           
## # ... with 4 more variables: gene_name <chr>, score_min <dbl>,
## #   score_max <dbl>, score_range <dbl>
###### there is only 7 library genes that are outside of s.lim.point for the WT construct, and 3 of those are nuclear pore proteins
#### they all also have very large score distributions in mutants
###### I would assume it's not a good idea to discard those
##### how does the distribution of scores look like for those library genes ?
ubermap.mutants.only %>%
  filter(library.gene_name %in% sig.interactions.in.wt$library.gene_name) %>%
  ggplot(aes(x = score)) +
  facet_wrap(~library.gene_name) + geom_histogram() + ggtitle("Distribution of E-MAP scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

###### NOTE: SEM1, SAC3, MFT1 and THP2 are part of THO/TREX complexes
##### Is this relevant?
#### In any case, those 7 library genes all have very broad distribution of scores in mutants, 
#### so don't discard any of them

(ubergenes.ubermap.gene_names_both_sig <- ubergenes.ubermap.gene_names_both %>%
  filter(library.ORF %in% significant.library.genes))
## # A tibble: 4,364,259 x 6
##    Gene_uniq   library.ORF    score ORF    Gene.gene_name library.gene_na…
##    <chr>       <chr>          <dbl> <chr>  <chr>          <chr>           
##  1 YLR268W     YAL002W       1.28   YLR26… SEC22          VPS8            
##  2 YOR011W     YAL002W      -0.137  YOR01… AUS1           VPS8            
##  3 YAL008W     YAL002W      NA      YAL00… FUN14          VPS8            
##  4 YOR043W     YAL002W      -0.915  YOR04… WHI2           VPS8            
##  5 YBR255W     YAL002W      -1.89   YBR25… MTC4           VPS8            
##  6 YBR128C     YAL002W       0.174  YBR12… ATG14          VPS8            
##  7 YDL214C     YAL002W       0.0256 YDL21… PRR2           VPS8            
##  8 YNL300W     YAL002W       0.509  YNL30… TOS6           VPS8            
##  9 YIL034C     YAL002W       1.31   YIL03… CAP2           VPS8            
## 10 YHR083W_TS… YAL002W      -0.240  YHR08… SAM35          VPS8            
## # ... with 4,364,249 more rows
write_tsv(ubergenes.ubermap.gene_names_both_sig, path = "basic_E-MAP_data/preprocessed_ubermap_ubergenes_only_significant.txt")

#### merge the ubergenes.ubermap.gene_names_both and ubermap.mutants.only into one tibble

(combined.emap.data <- bind_rows(ubermap.mutants.only, ubergenes.ubermap.gene_names_both))
## # A tibble: 5,996,950 x 6
##    Gene_uniq     library.ORF   score ORF   Gene.gene_name library.gene_na…
##    <chr>         <chr>         <dbl> <chr> <chr>          <chr>           
##  1 A180T         YAL002W      0.974  YLR2… GSP1           VPS8            
##  2 CTER3XFLAG WT YAL002W     -0.616  YLR2… GSP1           VPS8            
##  3 D79A          YAL002W      0.465  YLR2… GSP1           VPS8            
##  4 D79S          YAL002W      0.357  YLR2… GSP1           VPS8            
##  5 E115A         YAL002W     -0.740  YLR2… GSP1           VPS8            
##  6 E115I         YAL002W     -1.85   YLR2… GSP1           VPS8            
##  7 F58A          YAL002W      0.587  YLR2… GSP1           VPS8            
##  8 F58L          YAL002W     -0.189  YLR2… GSP1           VPS8            
##  9 G80A          YAL002W     -0.0453 YLR2… GSP1           VPS8            
## 10 GSP1-NAT      YAL002W      1.74   YLR2… GSP1           VPS8            
## # ... with 5,996,940 more rows
tail(combined.emap.data)
## # A tibble: 6 x 6
##   Gene_uniq library.ORF  score ORF     Gene.gene_name library.gene_name
##   <chr>     <chr>        <dbl> <chr>   <chr>          <chr>            
## 1 YDR247W   YPR189W      0.582 YDR247W VHS1           SKI3             
## 2 YHR086W   YPR189W     -0.660 YHR086W NAM8           SKI3             
## 3 YBR015C   YPR189W      0.634 YBR015C MNN2           SKI3             
## 4 YML055W   YPR189W      0.801 YML055W SPC2           SKI3             
## 5 YGL190C   YPR189W      2.96  YGL190C CDC55          SKI3             
## 6 YLR176C   YPR189W     -0.154 YLR176C RFX1           SKI3
write_tsv(combined.emap.data, path = "basic_E-MAP_data/preprocessed_ubermap_all.txt")
###### only significant subset
(combined.emap.data_sig <- combined.emap.data %>%
    filter(library.ORF %in% significant.library.genes))
## # A tibble: 4,421,902 x 6
##    Gene_uniq     library.ORF   score ORF   Gene.gene_name library.gene_na…
##    <chr>         <chr>         <dbl> <chr> <chr>          <chr>           
##  1 A180T         YAL002W      0.974  YLR2… GSP1           VPS8            
##  2 CTER3XFLAG WT YAL002W     -0.616  YLR2… GSP1           VPS8            
##  3 D79A          YAL002W      0.465  YLR2… GSP1           VPS8            
##  4 D79S          YAL002W      0.357  YLR2… GSP1           VPS8            
##  5 E115A         YAL002W     -0.740  YLR2… GSP1           VPS8            
##  6 E115I         YAL002W     -1.85   YLR2… GSP1           VPS8            
##  7 F58A          YAL002W      0.587  YLR2… GSP1           VPS8            
##  8 F58L          YAL002W     -0.189  YLR2… GSP1           VPS8            
##  9 G80A          YAL002W     -0.0453 YLR2… GSP1           VPS8            
## 10 GSP1-NAT      YAL002W      1.74   YLR2… GSP1           VPS8            
## # ... with 4,421,892 more rows
write_tsv(combined.emap.data_sig, path = "basic_E-MAP_data/preprocessed_ubermap_all_sig.txt")

######### non merged ubermap
not.merged.ubermap <- read_tsv("basic_E-MAP_data/ORF_names_ubermap_not_merged.txt",  col_names = T)
## Warning: Duplicated column names deduplicated: 'YDR113C' =>
## 'YDR113C_1' [911], 'YLR350W' => 'YLR350W_1' [991], 'YDR432W' =>
## 'YDR432W_1' [1216], 'YGR108W' => 'YGR108W_1' [1303], 'YDR113C' =>
## 'YDR113C_2' [1312], 'YLR350W' => 'YLR350W_2' [1347], 'YGR109C' =>
## 'YGR109C_1' [2039], 'YDR432W' => 'YDR432W_2' [2523], 'YDR113C' =>
## 'YDR113C_3' [2725], 'YER008C' => 'YER008C_1' [2736], 'YPR120C' =>
## 'YPR120C_1' [2823], 'YGR038W' => 'YGR038W_1' [3084], 'YBR160W' =>
## 'YBR160W_1' [3308], 'YBR088C' => 'YBR088C_1' [3371], 'YPR119W' =>
## 'YPR119W_1' [3503], 'YDL155W' => 'YDL155W_1' [3541], 'YDL084W' =>
## 'YDL084W_1' [3548], 'YDR432W' => 'YDR432W_3' [3567], 'YGR038W' =>
## 'YGR038W_2' [3737], 'YJL085W' => 'YJL085W_1' [4140], 'YBR088C' =>
## 'YBR088C_2' [4448], 'YPL153C' => 'YPL153C_1' [4596]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene = col_character()
## )
## See spec(...) for full column specifications.
#### warning message when reading in the non-merged ubermap
# Warning message:
#   Duplicated column names deduplicated: 'YDR113C' => 'YDR113C_1' [911], 'YLR350W' => 'YLR350W_1' [991], 'YDR432W' => 'YDR432W_1' [1216], 'YGR108W' => 'YGR108W_1' [1303], 'YDR113C' => 'YDR113C_2' [1312], 'YLR350W' => 'YLR350W_2' [1347], 'YGR109C' => 'YGR109C_1' [2039], 'YDR432W' => 'YDR432W_2' [2523], 'YDR113C' => 'YDR113C_3' [2725], 'YER008C' => 'YER008C_1' [2736], 'YPR120C' => 'YPR120C_1' [2823], 'YGR038W' => 'YGR038W_1' [3084], 'YBR160W' => 'YBR160W_1' [3308], 'YBR088C' => 'YBR088C_1' [3371], 'YPR119W' => 'YPR119W_1' [3503], 'YDL155W' => 'YDL155W_1' [3541], 'YDL084W' => 'YDL084W_1' [3548], 'YDR432W' => 'YDR432W_3' [3567], 'YGR038W' => 'YGR038W_2' [3737], 'YJL085W' => 'YJL085W_1' [4140], 'YBR088C' => 'YBR088C_2' [4448], 'YPL153C' => 'YPL153C_1' [4596] 
### tidyverse sorts out duplicated library genes successfully 
not.merged.ubermap %>%
  ggplot(aes(x = YDR113C_1, y = YDR113C)) + geom_point()
## Warning: Removed 4702 rows containing missing values (geom_point).

#### can't gather before sorting out the Genes that occur multiple times (add a unique identifier to them)
### first find those Genes that are not unique (probably data from separate experiments/labs for the same thing? Stll better not to mix them up)
(not_uniq <- not.merged.ubermap %>% group_by(Gene) %>% 
    summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 15 x 2
##    Gene    count
##    <chr>   <int>
##  1 YBR088C     3
##  2 YBR160W     2
##  3 YDL084W     2
##  4 YDL155W     2
##  5 YDR113C     4
##  6 YDR432W     4
##  7 YER008C     2
##  8 YGR038W     3
##  9 YGR108W     2
## 10 YGR109C     2
## 11 YJL085W     2
## 12 YLR350W     3
## 13 YPL153C     2
## 14 YPR119W     2
## 15 YPR120C     2
for (temp.gene in not_uniq$Gene) {
  row.names <- which(not.merged.ubermap$Gene == temp.gene)
  for (i in seq_along(row.names)) {
    not.merged.ubermap$Gene[row.names[i]] <- paste(i, not.merged.ubermap$Gene[row.names[i]], sep = "_")
  }
}
### now those few non-unique Genes look like this: 1_YDL155W - YDL155W
#### check
filter(not.merged.ubermap, Gene == "1_YJL085W" | Gene == "2_YJL085W")
## # A tibble: 2 x 5,398
##   Gene      YLR268W YOR011W YLR359W_DAmP YAL008W YOR043W YBR255W YBR128C
##   <chr>       <dbl>   <dbl>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1_YJL085W    2.00  0.0066        -1.13 -0.0527   NA      NA       1.13
## 2 2_YJL085W   NA    -0.342         NA    -1.91     -4.02    1.13    1.32
## # ... with 5,390 more variables: YDL214C <dbl>, YNL300W <dbl>,
## #   YIL034C <dbl>, YHR083W_TSQ493 <dbl>, YGR164W <dbl>, YGR131W <dbl>,
## #   YLR144C <dbl>, YNL003C <dbl>, YBR160W <dbl>, YJL082W <dbl>,
## #   YIL044C <dbl>, YJL142C <dbl>, YPL191C <dbl>, YGL008C_DAmP <dbl>,
## #   YGL215W <dbl>, YDR312W <dbl>, YLR348C <dbl>, YKL074C <dbl>,
## #   YNR067C <dbl>, YJL077C <dbl>, `YKL096W-A` <dbl>, YFL010C <dbl>,
## #   YDR141C <dbl>, YIL124W <dbl>, YPL039W <dbl>, YJL081C_DAmP <dbl>,
## #   YKL048C <dbl>, YDR083W <dbl>, YIR013C <dbl>, YPL071C <dbl>,
## #   YKL183W <dbl>, YPL024W <dbl>, YKL134C <dbl>, YGR063C <dbl>,
## #   YKL029C <dbl>, YPL166W <dbl>, YLR220W <dbl>, YBR273C <dbl>,
## #   YHR025W <dbl>, YOR313C <dbl>, YAL026C <dbl>, YHR007C_TSQ14 <dbl>,
## #   YBL008W <dbl>, YGL237C <dbl>, YCL023C <dbl>, YAR035W <dbl>,
## #   YLR145W_DAmP <dbl>, YOL129W <dbl>, YHR130C <dbl>, YOL116W <dbl>,
## #   YBR182C <dbl>, YGL044C_DAmP <dbl>, YDR196C <dbl>, YBR280C <dbl>,
## #   YNR032W <dbl>, YML105C <dbl>, YNL288W <dbl>, YPL104W <dbl>,
## #   YIL059C <dbl>, YLR253W <dbl>, YDL140C_DAmP <dbl>, YNR018W <dbl>,
## #   YPL144W <dbl>, YFR047C <dbl>, YBR215W <dbl>, YGL142C_DAmP <dbl>,
## #   YHR078W <dbl>, YGR084C <dbl>, YJR115W <dbl>, YCR002C <dbl>,
## #   YGL157W <dbl>, YBL015W <dbl>, YLR300W <dbl>, YBL003C <dbl>,
## #   YJR140C <dbl>, YGR146C <dbl>, YOR033C <dbl>, YOR062C <dbl>,
## #   YOR290C_DAMP <dbl>, YBR043C <dbl>, YHR112C <dbl>, YNR007C <dbl>,
## #   YLR011W <dbl>, YFL008W_TSQ321 <dbl>, YKR103W <dbl>, YKR084C <dbl>,
## #   YER121W <dbl>, YLR059C <dbl>, YDR059C <dbl>, YOR240W <dbl>,
## #   YLR105C_DAmP <dbl>, YKL204W <dbl>, YGR291C <dbl>, YPL248C <dbl>,
## #   YMR233W <dbl>, YIR022W_DAmP <dbl>, YDR393W <dbl>, YPR194C <dbl>,
## #   YGR019W <dbl>, YGL244W <dbl>, …
# not_uniq is now empty
(not_uniq <- not.merged.ubermap %>% group_by(Gene) %>% 
    summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 0 x 2
## # ... with 2 variables: Gene <chr>, count <int>
not.merged.ubermap <- not.merged.ubermap %>% 
  gather(library.ORF, score, -Gene) %>%
  drop_na(score) %>%
  rename("query_uniq" = "Gene") %>%
  mutate("query.ORF" = gsub(query_uniq, pattern = " - DELTA", replacement = " -- DELTA", perl = T)) %>% ##### this is to make sure things like YLR337C - DELTA YLR338W - YLR337C - DELTA YLR338W are not collapse to just YLR337C in the next step
  mutate("query.ORF" = gsub(query.ORF, pattern = "^[0-9]+_", replacement = "", perl = T)) %>%
  mutate("query.ORF" = gsub(query.ORF, pattern = "_.+$", replacement = "", perl = T))
filter(not.merged.ubermap, query_uniq != query.ORF)
## # A tibble: 1,535,738 x 4
##    query_uniq   library.ORF  score query.ORF
##    <chr>        <chr>        <dbl> <chr>    
##  1 YGL008C_DAmP YLR268W      1.22  YGL008C  
##  2 YJL081C_DAmP YLR268W      0.645 YJL081C  
##  3 YDL140C_DAmP YLR268W     -2.48  YDL140C  
##  4 YGL142C_DAmP YLR268W      0.309 YGL142C  
##  5 YDL165W_DAmP YLR268W     -0.781 YDL165W  
##  6 YDR141C_DAmP YLR268W     -8.36  YDR141C  
##  7 YDR407C_DAmP YLR268W      0.584 YDR407C  
##  8 YMR236W_DAmP YLR268W      3.47  YMR236W  
##  9 YJL034W_DAmP YLR268W      0.118 YJL034W  
## 10 YDR190C_DAmP YLR268W     -6.28  YDR190C  
## # ... with 1,535,728 more rows
not.merged.ubermap.gene.names <- inner_join(not.merged.ubermap, orf_index, by = c("query.ORF" = "orf")) %>%
  rename("query_gene_name" = "gene_name")

#### confirm that all query_uniq are unique
##########################
# ubermap_per_lib_orf_count <- not.merged.ubermap.gene.names %>% group_by(library.ORF) %>%
#   summarise("gene_count" = n())
# unique(ubermap_per_lib_orf_count$gene_count)
# ubermap_per_gene_count <- not.merged.ubermap.gene.names %>% group_by(query_uniq) %>%
#   summarise("lib_count" = n())
# unique(ubermap_per_gene_count$lib_count)
# ubermap_per_gene_count %>% 
#   filter(lib_count > mean(ubermap_per_gene_count$lib_count)) %>%
#   pull(query_uniq)
# ubermap_per_gene_count %>% filter(lib_count == 3072)
save(not.merged.ubermap.gene.names, file = "basic_E-MAP_data/ubermap_not_merged.RData")
##########################################

#### preprocess histone point mutations data
histones <- read_delim("basic_E-MAP_data/20180226_histones_pE-MAPs.txt", delim = "\t", col_names = T)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene = col_character()
## )
## See spec(...) for full column specifications.
histones <- histones %>% gather(library.gene_name, score, -Gene)
write_tsv(histones, path = "basic_E-MAP_data/histones_pE-MAPs.txt")